Today we're giving our AI assistant the power of "sight"! By integrating Gemini Vision and multimodal processing, the assistant will be able to "see" and understand image content, enabling well-rounded intelligent interaction across text, images, and voice.
Real-world information is not limited to text; it also includes visual content such as photos, screenshots, charts, and scanned documents.
A multimodal AI assistant can process this complex visual information and deliver a more complete service experience.
multimodal_assistant/
├── main.py                       # Main program
├── vision/
│   ├── __init__.py
│   ├── image_analyzer.py         # Image analyzer
│   ├── ocr_processor.py          # OCR text recognition
│   └── vision_utils.py           # Vision utility functions
├── multimodal/
│   ├── __init__.py
│   ├── content_processor.py      # Multimodal content processor
│   └── response_generator.py     # Multimodal response generator
├── workflows/
│   ├── __init__.py
│   └── vision_workflow.py        # LangGraph vision workflow
└── utils/
    ├── __init__.py
    └── image_utils.py            # Image utility functions
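The listings below assume the google-generativeai, pillow, and langgraph packages are installed (`pip install google-generativeai pillow langgraph`) and that your Gemini API key is exported as the GEMINI_API_KEY environment variable. A minimal pre-flight check, as a sketch:

import os

# Fail fast if the API key is missing; every module below reads this variable
assert os.getenv("GEMINI_API_KEY"), "Set GEMINI_API_KEY before running the assistant"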
import google.generativeai as genai
from PIL import Image
from typing import Dict
import os


class GeminiVisionAnalyzer:
    """Gemini Vision image analyzer"""

    def __init__(self):
        genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
        # 'gemini-pro-vision' follows the original tutorial; on newer SDK
        # versions this model has been retired, so a current multimodal model
        # such as 'gemini-1.5-flash' may be needed instead.
        self.vision_model = genai.GenerativeModel('gemini-pro-vision')

    def analyze_image(self, image_path: str, prompt: str = "Please describe the content of this image") -> Dict:
        """Analyze the content of an image"""
        try:
            # Load the image
            image = Image.open(image_path)

            # Analyze it with Gemini Vision
            response = self.vision_model.generate_content([prompt, image])

            return {
                'success': True,
                'analysis': response.text,
                'image_info': {
                    'size': image.size,
                    'mode': image.mode,
                    'format': image.format
                }
            }
        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'analysis': None
            }

    def extract_text_from_image(self, image_path: str) -> Dict:
        """Extract text from an image (OCR)"""
        prompt = """
        Please extract all of the text in this image, including:
        1. Main headings
        2. Body text
        3. Table data
        4. Labels and annotations
        Preserve the original formatting and structure.
        """
        result = self.analyze_image(image_path, prompt)
        if result['success']:
            return {
                'success': True,
                'extracted_text': result['analysis'],
                'image_info': result['image_info']
            }
        else:
            return result

    def analyze_chart(self, image_path: str) -> Dict:
        """Analyze the content of a chart"""
        prompt = """
        Please analyze this chart in detail, including:
        1. Chart type (bar chart, line chart, pie chart, etc.)
        2. Data trends and key findings
        3. Axis labels and value ranges
        4. Main conclusions and insights
        Provide a structured analysis report.
        """
        result = self.analyze_image(image_path, prompt)
        if result['success']:
            return {
                'success': True,
                'chart_analysis': result['analysis'],
                'image_info': result['image_info']
            }
        else:
            return result

    def identify_objects(self, image_path: str) -> Dict:
        """Identify objects in an image"""
        prompt = """
        Please identify all objects and elements in this image, including:
        1. Main subjects (people, animals, items)
        2. Scene and setting (indoor/outdoor, type of location)
        3. Colors and stylistic features
        4. Likely purpose or context
        List the results in bullet form.
        """
        result = self.analyze_image(image_path, prompt)
        if result['success']:
            return {
                'success': True,
                'objects': result['analysis'],
                'image_info': result['image_info']
            }
        else:
            return result
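A quick usage sketch for the analyzer (the image path is a placeholder; substitute any local file):

from vision.image_analyzer import GeminiVisionAnalyzer

analyzer = GeminiVisionAnalyzer()
result = analyzer.analyze_image("samples/demo.png", "What is the main subject of this photo?")
if result['success']:
    print(result['analysis'])
    print(result['image_info'])  # e.g. {'size': (1024, 768), 'mode': 'RGB', 'format': 'PNG'}
else:
    print("Analysis failed:", result['error'])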
from vision.image_analyzer import GeminiVisionAnalyzer
import google.generativeai as genai
from typing import Dict, List
import os

genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
text_model = genai.GenerativeModel('gemini-pro')


class MultimodalProcessor:
    """Multimodal content processor"""

    def __init__(self):
        self.vision_analyzer = GeminiVisionAnalyzer()

    def process_text_and_image(self, text: str, image_path: str) -> Dict:
        """Handle a combined text-and-image request"""
        try:
            # Analyze the image first
            image_analysis = self.vision_analyzer.analyze_image(
                image_path,
                "Please describe the content of this image in detail"
            )

            if not image_analysis['success']:
                return {
                    'success': False,
                    'error': 'Image analysis failed',
                    'details': image_analysis['error']
                }

            # Combine the user's question with the image analysis to generate a response
            combined_prompt = f"""
            User question: {text}

            Image content analysis:
            {image_analysis['analysis']}

            Please answer the user's question based on the image content, providing detailed and accurate information.
            """
            response = text_model.generate_content(combined_prompt)

            return {
                'success': True,
                'response': response.text,
                'image_analysis': image_analysis['analysis'],
                'processing_type': 'multimodal'
            }
        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'processing_type': 'multimodal'
            }

    def compare_images(self, image1_path: str, image2_path: str, comparison_aspect: str = "general") -> Dict:
        """Compare two images"""
        try:
            # Analyze each image separately
            analysis1 = self.vision_analyzer.analyze_image(image1_path)
            analysis2 = self.vision_analyzer.analyze_image(image2_path)

            if not (analysis1['success'] and analysis2['success']):
                return {
                    'success': False,
                    'error': 'Image analysis failed'
                }

            # Generate the comparison report
            comparison_prompt = f"""
            Please compare the following two images, focusing on the {comparison_aspect} aspect:

            Image 1 analysis:
            {analysis1['analysis']}

            Image 2 analysis:
            {analysis2['analysis']}

            Provide a detailed comparison report covering similarities, differences, and key findings.
            """
            comparison = text_model.generate_content(comparison_prompt)

            return {
                'success': True,
                'comparison': comparison.text,
                'image1_analysis': analysis1['analysis'],
                'image2_analysis': analysis2['analysis']
            }
        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def generate_image_summary(self, image_paths: List[str]) -> Dict:
        """Generate a combined summary of multiple images"""
        try:
            analyses = []
            success_count = 0
            for i, image_path in enumerate(image_paths):
                result = self.vision_analyzer.analyze_image(image_path)
                if result['success']:
                    analyses.append(f"Image {i + 1}: {result['analysis']}")
                    success_count += 1
                else:
                    analyses.append(f"Image {i + 1}: analysis failed")

            # Count successes explicitly: the original `if not analyses` check
            # never fired once failure notes were appended to the list
            if success_count == 0:
                return {
                    'success': False,
                    'error': 'No images were analyzed successfully'
                }

            # Generate the combined summary
            summary_prompt = f"""
            Based on the following analyses of multiple images, generate a combined summary:

            {chr(10).join(analyses)}

            The summary should include:
            1. Common themes or features
            2. An overview of the main content
            3. Key findings and insights
            4. Suggested next steps
            """
            summary = text_model.generate_content(summary_prompt)

            return {
                'success': True,
                'summary': summary.text,
                'individual_analyses': analyses,
                'total_images': len(image_paths)
            }
        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }
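And a usage sketch for the processor (paths are placeholders):

from multimodal.content_processor import MultimodalProcessor

processor = MultimodalProcessor()

# Multimodal Q&A about a single picture
qa = processor.process_text_and_image("How many people are in this photo?", "samples/team.jpg")
print(qa['response'] if qa['success'] else qa['error'])

# Compare two pictures on a specific aspect
diff = processor.compare_images("samples/before.png", "samples/after.png", comparison_aspect="layout")
print(diff['comparison'] if diff['success'] else diff['error'])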
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Dict, Literal
from multimodal.content_processor import MultimodalProcessor
from vision.image_analyzer import GeminiVisionAnalyzer


class VisionWorkflowState(TypedDict):
    user_query: str
    image_paths: List[str]
    processing_mode: str  # analyze, ocr, chart, compare, multimodal
    vision_results: Dict
    final_response: str
    confidence: float


# Initialize the processors
multimodal_processor = MultimodalProcessor()
vision_analyzer = GeminiVisionAnalyzer()


def analyze_request_type(state: VisionWorkflowState) -> VisionWorkflowState:
    """Classify the type of request"""
    query = state["user_query"].lower()

    # Simple keyword heuristic to pick a processing mode
    if any(keyword in query for keyword in ['text', 'ocr', 'extract', 'transcribe']):
        mode = 'ocr'
    elif any(keyword in query for keyword in ['chart', 'graph', 'plot', 'trend', 'data']):
        mode = 'chart'
    elif any(keyword in query for keyword in ['compare', 'comparison', 'versus', 'vs']):
        mode = 'compare'
    elif len(state["image_paths"]) > 0 and state["user_query"]:
        mode = 'multimodal'
    else:
        mode = 'analyze'

    return {
        **state,
        "processing_mode": mode
    }
def analyze_single_image(state: VisionWorkflowState) -> VisionWorkflowState:
    """Analyze a single image"""
    if not state["image_paths"]:
        return {
            **state,
            "vision_results": {"error": "No image provided"},
            "final_response": "❌ Please provide an image to analyze",
            "confidence": 0.0
        }

    image_path = state["image_paths"][0]
    result = vision_analyzer.analyze_image(image_path, state["user_query"] or "Please describe this image")

    if result['success']:
        response = f"🖼️ **Image Analysis**\n\n{result['analysis']}"
        confidence = 0.8
    else:
        response = f"❌ Image analysis failed: {result['error']}"
        confidence = 0.1

    return {
        **state,
        "vision_results": result,
        "final_response": response,
        "confidence": confidence
    }


def extract_text_from_image(state: VisionWorkflowState) -> VisionWorkflowState:
    """Extract text from an image"""
    if not state["image_paths"]:
        return {
            **state,
            "vision_results": {"error": "No image provided"},
            "final_response": "❌ Please provide an image to process",
            "confidence": 0.0
        }

    image_path = state["image_paths"][0]
    result = vision_analyzer.extract_text_from_image(image_path)

    if result['success']:
        response = f"📝 **Extracted Text**\n\n{result['extracted_text']}"
        confidence = 0.9
    else:
        response = f"❌ Text extraction failed: {result['error']}"
        confidence = 0.1

    return {
        **state,
        "vision_results": result,
        "final_response": response,
        "confidence": confidence
    }


def analyze_chart(state: VisionWorkflowState) -> VisionWorkflowState:
    """Analyze a chart"""
    if not state["image_paths"]:
        return {
            **state,
            "vision_results": {"error": "No image provided"},
            "final_response": "❌ Please provide a chart to analyze",
            "confidence": 0.0
        }

    image_path = state["image_paths"][0]
    result = vision_analyzer.analyze_chart(image_path)

    if result['success']:
        response = f"📊 **Chart Analysis Report**\n\n{result['chart_analysis']}"
        confidence = 0.85
    else:
        response = f"❌ Chart analysis failed: {result['error']}"
        confidence = 0.1

    return {
        **state,
        "vision_results": result,
        "final_response": response,
        "confidence": confidence
    }


def process_multimodal_request(state: VisionWorkflowState) -> VisionWorkflowState:
    """Handle a multimodal request"""
    if not state["image_paths"] or not state["user_query"]:
        return {
            **state,
            "vision_results": {"error": "Missing image or question"},
            "final_response": "❌ Please provide both an image and a specific question",
            "confidence": 0.0
        }

    image_path = state["image_paths"][0]
    result = multimodal_processor.process_text_and_image(state["user_query"], image_path)

    if result['success']:
        response = f"🤖 **AI Response**\n\n{result['response']}"
        confidence = 0.85
    else:
        response = f"❌ Processing failed: {result['error']}"
        confidence = 0.1

    return {
        **state,
        "vision_results": result,
        "final_response": response,
        "confidence": confidence
    }


def compare_images(state: VisionWorkflowState) -> VisionWorkflowState:
    """Compare images"""
    if len(state["image_paths"]) < 2:
        return {
            **state,
            "vision_results": {"error": "At least two images are required"},
            "final_response": "❌ Please provide at least two images to compare",
            "confidence": 0.0
        }

    result = multimodal_processor.compare_images(
        state["image_paths"][0],
        state["image_paths"][1]
    )

    if result['success']:
        response = f"🔍 **Image Comparison**\n\n{result['comparison']}"
        confidence = 0.8
    else:
        response = f"❌ Comparison failed: {result['error']}"
        confidence = 0.1

    return {
        **state,
        "vision_results": result,
        "final_response": response,
        "confidence": confidence
    }
def route_vision_processing(state: VisionWorkflowState) -> Literal["analyze", "ocr", "chart", "multimodal", "compare"]:
    """Route to the chosen vision-processing mode"""
    return state["processing_mode"]


def create_vision_workflow():
    """Build the vision-processing workflow"""
    workflow = StateGraph(VisionWorkflowState)

    # Add the nodes
    workflow.add_node("analyze_request", analyze_request_type)
    workflow.add_node("analyze", analyze_single_image)
    workflow.add_node("ocr", extract_text_from_image)
    workflow.add_node("chart", analyze_chart)
    workflow.add_node("multimodal", process_multimodal_request)
    workflow.add_node("compare", compare_images)

    # Set the entry point
    workflow.set_entry_point("analyze_request")

    # Conditional routing
    workflow.add_conditional_edges(
        "analyze_request",
        route_vision_processing,
        {
            "analyze": "analyze",
            "ocr": "ocr",
            "chart": "chart",
            "multimodal": "multimodal",
            "compare": "compare"
        }
    )

    # Terminal edges
    workflow.add_edge("analyze", END)
    workflow.add_edge("ocr", END)
    workflow.add_edge("chart", END)
    workflow.add_edge("multimodal", END)
    workflow.add_edge("compare", END)

    return workflow.compile()
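Before wiring the workflow into the CLI, it can help to sanity-check the keyword router on its own. A small sketch (the query and path are made up; a TypedDict is just a dict at runtime):

sample_state = {
    "user_query": "Please extract the text from this receipt",
    "image_paths": ["samples/receipt.jpg"],
    "processing_mode": "",
    "vision_results": {},
    "final_response": "",
    "confidence": 0.0,
}
print(analyze_request_type(sample_state)["processing_mode"])  # expected: 'ocr'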
from workflows.vision_workflow import create_vision_workflow
import os


def main():
    """Multimodal vision assistant entry point"""
    print("👁️ Multimodal Vision AI Assistant")
    print("🔧 Features: image analysis, OCR, chart interpretation, multimodal Q&A")
    print("=" * 55)

    app = create_vision_workflow()

    print("💡 Example commands:")
    print("  • 'Analyze this image' + image path")
    print("  • 'Extract the text from this image' + image path")
    print("  • 'What trend does this chart show?' + image path")
    print("  • 'Compare these two images' + two image paths\n")

    while True:
        try:
            # Get the user's query
            query = input("💬 Describe what you need: ").strip()
            if query.lower() in ['quit', 'exit']:
                print("👋 Goodbye!")
                break

            # Get the image path(s)
            image_input = input("📸 Enter image path(s), separated by spaces: ").strip()
            if not image_input:
                print("❌ Please provide an image path")
                continue

            image_paths = []
            for path in image_input.split():
                if os.path.exists(path):
                    image_paths.append(path)
                else:
                    print(f"⚠️ File not found: {path}")

            if not image_paths:
                print("❌ No valid image paths")
                continue

            print("🔍 Analyzing...")

            # Run the vision workflow
            result = app.invoke({
                "user_query": query,
                "image_paths": image_paths,
                "processing_mode": "",
                "vision_results": {},
                "final_response": "",
                "confidence": 0.0
            })

            print(f"\n{result['final_response']}")
            if result.get('confidence', 0) > 0:
                print(f"🎯 Confidence: {result['confidence']:.2f}")
            print("-" * 50)

        except KeyboardInterrupt:
            print("\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")


if __name__ == "__main__":
    main()
✅ Comprehensive visual understanding: image description, object recognition, scene analysis
✅ Smart OCR: text extraction, table recognition
✅ Professional chart analysis: data trends, insight reports
✅ Multimodal Q&A: intelligent conversations combining text and vision
✅ LangGraph orchestration: a clear vision-processing flow
Today we successfully gave our AI assistant the power of "sight"! With Gemini Vision and multimodal processing, the assistant can now understand images, extract text, and analyze charts, enabling truly multi-sensory intelligent interaction. Tomorrow we'll cover error handling and exception management to keep the system stable and reliable!